{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import OneHotEncoder,Binarizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression,LinearRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import accuracy_score,confusion_matrix,mean_squared_error,recall_score,roc_auc_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import RocCurveDisplay\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import precision_recall_curve\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n",
    "import jieba\n",
    "\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# df = pd.read_excel(\"C:\\\\Users\\\\Administrator\\\\Desktop\\\\月考练习算法题 (2)\\\\月考练习算法题\\\\第3套（修改2）\\\\专高6月考-03附件\\\\新闻.xlsx\")\n",
    "# df"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# titles = df['标题']\n",
    "# titles\n",
    "#"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "# words  = []\n",
    "# for i,row in df.iterrows():\n",
    "# \tword = jieba.cut(row[\"标题\"])\n",
    "# \tresult = \"  \".join(word)\n",
    "# \twords.append(result)\n",
    "\n",
    "# words"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# words1 = []\n",
    "# for i, row  in df.iterrows():\n",
    "# \twords1.append(\" \".join(jieba.cut(row[\"标题\"])))\n",
    "# \tif i == 2:\n",
    "# \t\tbreak\n",
    "#\n",
    "# print(words1)\n",
    "# words[:3]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "# 4.建立词频矩阵\n",
    "# vect = CountVectorizer()\n",
    "# X = vect.fit_transform(words)\n",
    "#\n",
    "# X = X.toarray()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "# 5.查看词袋对应顺序\n",
    "# words_bag1 = vect.vocabulary_\n",
    "# words_bag1\n",
    "#查看词袋\n",
    "# words_bag2 = vect.get_feature_names_out()\n",
    "# words_bag2"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "# 6\n",
    "# vect = CountVectorizer()\n",
    "# X = vect.fit_transform(words)\n",
    "# X = X.toarray()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "# 7.\n",
    "# words_bag = vect.vocabulary_\n",
    "# words_bag\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [],
   "source": [
    "#8.文本向量化\n",
    "# X = vect.fit_transform(titles)\n",
    "# X = X.toarray()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%#\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "# vect.get_feature_names_out()\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "# words_bag = vect.vocabulary_\n",
    "# words_bag"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "#11\n",
    "#\n",
    "# words_bag2 = vect.get_feature_names_out()\n",
    "# df = pd.DataFrame(X, columns=words_bag2)\n",
    "# df.head()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [
    "# 12\n",
    "# kms = KMeans(n_clusters=10, random_state=123)\n",
    "# k_data = kms.fit_predict(df)\n",
    "# k_data"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "# words_ary = np.array(words)\n",
    "# print(words_ary[k_data == 1])\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}